## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 555049 29.7 1242625 66.4 686457 36.7
## Vcells 1018217 7.8 8388608 64.0 1876634 14.4
library(magrittr)
library(data.table)
library(knitr)
`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))# fp = file.path('..', 'input', 'ath-annot')
fp = file.path('..', 'input', 'Mercator')
# fn = 'ath_Araport11_2018-05-25_mapping.txt.gz'
# fn = 'X4.4_Arabidopsis_thaliana.txt'
fn = 'ath_Mercator4v7_results.txt'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]
combined = gmm[, .(
BINCODE = paste(unique(BINCODE), collapse = " | "),
NAME = paste(unique(NAME), collapse = " | "),
DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]
combined$IDENTIFIER = sub("\\'", '', sub("\\..*$", "", toupper(combined$IDENTIFIER)))
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)
colnames(combined)[2:4] = paste('ath', colnames(combined)[2:4], sep = '_')
ath.gmm = combinednote: some duplicated ids in PSS
fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12',
'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]
fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
# Remove NA and empty strings
x = x[!is.na(x) & x != ""]
paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]
fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]
pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]
pss_long = melt(
pss,
id.vars = c("name", "all_pathways", 'short_name'), # Columns to keep as is
measure.vars = patterns("^id_"), # Columns to melt (all starting with "id_")
variable.name = "id_num", # Name for the melted variable column
value.name = "id" # Name for the melted value column
)
pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))##
## FALSE TRUE
## 816 24
pss_long %>%
dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
dplyr::arrange(id) %>%
print()## all_pathways short_name id
## <char> <char> <char>
## 1: Hormone - Ethylene (ET) ORA59 AT1G06160
## 2: Hormone - Ethylene (ET) ERF/ORA59 AT1G06160
## 3: Hormone - Salicylic acid (SA) TCP8 AT1G58100
## 4: Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
## 5: Hormone - Ethylene (ET) EDF2 AT1G68840
## 6: Hormone - Ethylene (ET) ERF/EDF AT1G68840
## 7: Signalling - Heat-shock proteins (HSPs),Stress - Heat HSP70 AT3G12580
## 8: Signalling - Heat-shock proteins (HSPs) HSP AT3G12580
## 9: Hormone - Ethylene (ET) ERF1 AT3G23240
## 10: Hormone - Ethylene (ET) ERF/EDF AT3G23240
## 11: Hormone - Ethylene (ET) ERF6 AT4G17490
## 12: Hormone - Ethylene (ET) ERF/ORA59 AT4G17490
## 13: Hormone - Ethylene (ET) ERF1 AT4G17500
## 14: Hormone - Ethylene (ET) ERF/EDF AT4G17500
## 15: Signalling - Heat-shock proteins (HSPs) MED37E AT5G02500
## 16: Signalling - Heat-shock proteins (HSPs) HSP AT5G02500
## 17: Hormone - Ethylene (ET) ERF096 AT5G43410
## 18: Hormone - Ethylene (ET) ERF/EDF AT5G43410
## 19: Hormone - Ethylene (ET) ERF5 AT5G47230
## 20: Hormone - Ethylene (ET) ERF/ORA59 AT5G47230
## 21: Hormone - Ethylene (ET) ERF105 AT5G51190
## 22: Hormone - Ethylene (ET) ERF/ORA59 AT5G51190
## 23: Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24: Signalling - Heat-shock proteins (HSPs) HSP AT5G59720
## 25: Hormone - Ethylene (ET) ERF104 AT5G61600
## 26: Hormone - Ethylene (ET) ERF/EDF AT5G61600
## all_pathways short_name id
pss_long = pss_long %>%
dplyr::filter(stringr::str_starts(id, "AT")) %>%
dplyr::group_by(id) %>%
dplyr::summarise(
dplyr::across(
.cols = dplyr::everything(),
.fns = ~ {
vals = unique(na.omit(.))
if (length(vals) > 1) paste(vals, collapse = " | ")
else if (length(vals) == 1) vals
else NA_character_
}
),
.groups = "drop"
)| Plant Name | Label | JCVI-MCScan | Compara Plants | Plaza | OrthoDB | FastOma | RBH | Mercator |
|---|---|---|---|---|---|---|---|---|
| Malus domestica | apple | mdo_GDv1 | malus_domestica_golden | mdo | mdo | mdo | mdo | mdo |
| mdo_HChap1 | ||||||||
| Prunus persica | ppe | ppe | prunus_persica | ppe | ppe | pper | ppe | pper |
| Prunus dulcis / P. amygdalus | almond | almond | prunus_dulcis | pdul | pdul | pdul | pdul | |
| Prunus avium | wild cherry | wildcherry | prunus_avium | pavi | pavi | pavi | pavi | |
| Prunus armeniaca | apricot | apricot | parm | parm | parm | parm | ||
| Prunus cerasifera | cherry plum | cherryplum | pcer | pcer | pcer | |||
| Pyrus | pear | pear | pcox | pcox | pcox | |||
| Prunus sibirica | Siberian apricot | siberianapricot | psib | psib | psib |
# in OrthoDB and PLAZA
# mdo_HChap1/mdo/mdo
# ppe/ppe/pper
# in OrthoDB, no PLAZA
# pdul/pdul/pdul
# wildcherry/pavi/pavi
# apricot/parm/parm
# not in OrthoDB or PLAZA
# pear/pcox/pcox
# cherryplum/pcer/pcer
# siberianapricot/psib/psibparams_list <- list(
plantName1 = 'mdo'
, # change name - PLAZA, OrthoDB, RBH
plantName2 = 'malus_domestica_golden'
, # change name - compara # sources
plantName3 = 'mdo_HChap1'
, # change name - MCScanX # sources
plantName4 = 'mdo'
, # change name - FastOMA # sources
plantDirIn = "mdo_apple"
,# inconsistent-IDs, orthofinder
plantNameOut = "apple"
,
plantDirOut = file.path('..', 'reports', 'fruitTrees', "apple")
,
pattern_in = "\\.[^.]*$"
, # everythin after the last dot
pattern_out = ""
, # all-IDs
compara_pattern_in1 = '\\..*'
,
compara_pattern_out1 = ""
,
compara_pattern_in2 = ".*_"
,
compara_pattern_out2 = ""
,
plaza_pattern_in1 = '\\..*'
,
plaza_pattern_in2 = ".* "
,
ref_genome = "g.Honeycrisp_HAP1_braker1+2_combined_fullSupport_longname_filtered.pep"
, # inconsistent-IDs
mercator = 'mdo_Mercator4v7_results.txt'
, # plant-gmm
mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']"
, # plant-gmm
mercatorPatternOut1 = ""
, # plant-gmm
mercatorPatternIn2 = "a.g"
, # plant-gmm
mercatorPatternOut2 = "A.g"
, # plant-gmm
flag1 = 1
,
flag2 = 1
,
flag3 = FALSE
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000024eb2a4aaf0>
##
##
## processing file: ./08_fruitTrees-child1.rmd
| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-10] | |….. | 9% | |…… | 12% [unnamed-chunk-11] | |…….. | 15% | |……… | 18% [unnamed-chunk-12] | |……….. | 21% | |………… | 24% [unnamed-chunk-13] | |………….. | 27% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-15] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-16] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-17] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-18] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-19] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-20] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-21] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-22] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-23] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-24] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-25] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'ensembl-compara') {
dt = dt[dt$homology_species == plantName2, ]
# print(head(dt))
dt = dt[, c(1,2,6,7,10)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName4, ]
# print(head(dt))
dt = dt[, c(2,1, 4,3, 5)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'MCScanX') {
# dt = dt[grepl('stu', dt$to_plant), ]
dt = dt[grepl(plantName3, dt$to_plant), ] # change names
# print(head(dt))
dt = dt[, c(2,1, 4,3, 6)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$orthologous_species == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else print ('ERROR: Unknown source')
}## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
##
## ensembl-compara FastOMA MCScanX OrthoDB PLAZA
## 22396 79898 60472 56069 33104
## RBH
## 37682
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 <NA> AT4G10330.1 <NA> Maldo.hc.v1a1.ch1… FastO…
## 2 <NA> AT4G10330.1 <NA> Maldo.hc.v1a1.ch1… FastO…
## 3 <NA> AT5G44410.1 <NA> g1.t1 FastO…
## 4 <NA> AT5G44440.2 <NA> g1.t1 FastO…
## 5 <NA> AT1G71250.1 <NA> Maldo.hc.v1a1.ch1… MCSca…
## 6 <NA> AT1G71250.1 <NA> Maldo.hc.v1a1.ch1… MCSca…
## 7 <NA> AT2G07655.1 <NA> Maldo.hc.v1a1.sc4… MCSca…
## 8 <NA> AT2G07655.1 <NA> Maldo.hc.v1a1.sc4… MCSca…
## 9 AT2G46320 <NA> Maldo.hc.v1a1.ch1A.g26266 <NA> Ortho…
## 10 AT4G27940 <NA> Maldo.hc.v1a1.ch1A.g26266 <NA> Ortho…
## 11 AT2G07695 <NA> Maldo.hc.v1a1.sc164A.g48922 <NA> Ortho…
## 12 AT2G07695 <NA> Maldo.hc.v1a1.sc119A.g48697 <NA> Ortho…
## 13 AT1G01020 <NA> MD01G1091900 <NA> PLAZA
## 14 AT1G01020 <NA> MD07G1162500 <NA> PLAZA
## 15 AT1G16360 <NA> MD17G1286800 <NA> PLAZA
## 16 AT1G79450 <NA> MD17G1286800 <NA> PLAZA
## 17 AT1G01020 <NA> Maldo.hc.v1a1.ch7A.g41990 <NA> RBH
## 18 AT1G01030 <NA> Maldo.hc.v1a1.ch1A.g25187 <NA> RBH
## 19 ATMG01410 <NA> Maldo.hc.v1a1.sc36A.g49760 <NA> RBH
## 20 ATMG01410 <NA> Maldo.hc.v1a1.sc71A.g49908 <NA> RBH
## 21 AT1G01020 AT1G01020.1 MD01G0070900 mRNA:MD01G0070900 ensem…
## 22 AT1G01050 AT1G01050.1 MD07G0133100 mRNA:MD07G0133100 ensem…
## 23 AT5G67630 AT5G67630.1 MD02G0158200 mRNA:MD02G0158200 ensem…
## 24 AT5G67630 AT5G67630.1 MD15G0258100 mRNA:MD15G0258100 ensem…
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])
# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)##
## MCScanX
## 12
## from_geneID from_protID to_geneID to_protID
## <char> <char> <char> <char>
## 1: AT1G25470.uORF1 AT1G25470.uORF1 <NA> Maldo.hc.v1a1.ch13A.g09400.t1
## 2: AT1G68550.uORF1 AT1G68550.uORF1 <NA> Maldo.hc.v1a1.ch13A.g09400.t1
## 3: AT4G30960.uORF1 AT4G30960.uORF1 <NA> Maldo.hc.v1a1.ch15A.g15777.t1
## 4: AT1G58120.uORF1 AT1G58120.uORF1 <NA> Maldo.hc.v1a1.ch15A.g18095.t1
## 5: AT1G68550.uORF1 AT1G68550.uORF1 <NA> Maldo.hc.v1a1.ch16A.g19057.t1
## 6: AT4G25670.uORF1 AT4G25670.uORF1 <NA> Maldo.hc.v1a1.ch1A.g26199.t1
## 7: AT4G25690.uORF1 AT4G25690.uORF1 <NA> Maldo.hc.v1a1.ch1A.g26199.t1
## 8: AT5G52550.uORF1 AT5G52550.uORF1 <NA> Maldo.hc.v1a1.ch1A.g26199.t1
## 9: AT4G30960.uORF1 AT4G30960.uORF1 <NA> Maldo.hc.v1a1.ch2A.g26596.t1
## 10: AT4G25670.uORF1 AT4G25670.uORF1 <NA> Maldo.hc.v1a1.ch7A.g43057.t1
## 11: AT4G25690.uORF1 AT4G25690.uORF1 <NA> Maldo.hc.v1a1.ch7A.g43057.t1
## 12: AT5G52550.uORF1 AT5G52550.uORF1 <NA> Maldo.hc.v1a1.ch7A.g43057.t1
## source
## <char>
## 1: MCScanX
## 2: MCScanX
## 3: MCScanX
## 4: MCScanX
## 5: MCScanX
## 6: MCScanX
## 7: MCScanX
## 8: MCScanX
## 9: MCScanX
## 10: MCScanX
## 11: MCScanX
## 12: MCScanX
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT4G10330 AT4G10330.1 Maldo.hc.v1a1.ch10A.g00003 Maldo.hc.v1a1.ch1… FastO…
## 2 AT4G10330 AT4G10330.1 Maldo.hc.v1a1.ch10A.g00003 Maldo.hc.v1a1.ch1… FastO…
## 3 AT5G44410 AT5G44410.1 g1 g1.t1 FastO…
## 4 AT5G44440 AT5G44440.2 g1 g1.t1 FastO…
## 5 AT1G71250 AT1G71250.1 Maldo.hc.v1a1.ch10A.g00019 Maldo.hc.v1a1.ch1… MCSca…
## 6 AT1G71250 AT1G71250.1 Maldo.hc.v1a1.ch10A.g00019 Maldo.hc.v1a1.ch1… MCSca…
## 7 AT2G07655 AT2G07655.1 Maldo.hc.v1a1.sc45A.g49893 Maldo.hc.v1a1.sc4… MCSca…
## 8 AT2G07655 AT2G07655.1 Maldo.hc.v1a1.sc45A.g49893 Maldo.hc.v1a1.sc4… MCSca…
## 9 AT2G46320 <NA> Maldo.hc.v1a1.ch1A.g26266 <NA> Ortho…
## 10 AT4G27940 <NA> Maldo.hc.v1a1.ch1A.g26266 <NA> Ortho…
## 11 AT2G07695 <NA> Maldo.hc.v1a1.sc164A.g48922 <NA> Ortho…
## 12 AT2G07695 <NA> Maldo.hc.v1a1.sc119A.g48697 <NA> Ortho…
## 13 AT1G01020 <NA> MD01G1091900 <NA> PLAZA
## 14 AT1G01020 <NA> MD07G1162500 <NA> PLAZA
## 15 AT1G16360 <NA> MD17G1286800 <NA> PLAZA
## 16 AT1G79450 <NA> MD17G1286800 <NA> PLAZA
## 17 AT1G01020 <NA> Maldo.hc.v1a1.ch7A.g41990 <NA> RBH
## 18 AT1G01030 <NA> Maldo.hc.v1a1.ch1A.g25187 <NA> RBH
## 19 ATMG01410 <NA> Maldo.hc.v1a1.sc36A.g49760 <NA> RBH
## 20 ATMG01410 <NA> Maldo.hc.v1a1.sc71A.g49908 <NA> RBH
## 21 AT1G01020 AT1G01020.1 MD01G0070900 mRNA:MD01G0070900 ensem…
## 22 AT1G01050 AT1G01050.1 MD07G0133100 mRNA:MD07G0133100 ensem…
## 23 AT5G67630 AT5G67630.1 MD02G0158200 mRNA:MD02G0158200 ensem…
## 24 AT5G67630 AT5G67630.1 MD15G0258100 mRNA:MD15G0258100 ensem…
summary_na = df[, .(
na_to_geneID = sum(is.na(to_geneID)),
na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)## source na_to_geneID na_to_protID
## <char> <int> <int>
## 1: ensembl-compara 0 0
## 2: FastOMA 0 0
## 3: MCScanX 0 0
## 4: OrthoDB 0 56069
## 5: PLAZA 0 33104
## 6: RBH 0 37682
here we have some loses because genes between versions do not translate well!
fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
plaza = data.table::fread(file.path(fp, fn))
} else {
plaza = data.frame(matrix(ncol = 4, nrow = 0))
}
compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name
colnames(compara)[3] = colnames(plaza)[3] = 'source'
compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1,
sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)## OrthoDB_ID Ortholog
## <char> <char>
## 1: MD07G0132900 Maldo.hc.v1a1.ch16A.g19172.t1
## 2: MD07G0132900 Maldo.hc.v1a1.ch11A.g06055.t1
## 3: MD07G0132900 Maldo.hc.v1a1.ch11A.g06055.t2
## 4: MD07G0132900 Maldo.hc.v1a1.ch8A.g43704.t1
## 5: MD07G0132900 Maldo.hc.v1a1.ch14A.g12608.t1
## 6: MD07G0132900 Maldo.hc.v1a1.ch4A.g33125.t1
if (nrow(plaza) != 0) {
plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = plaza[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(plaza))]
plaza = result[, seq_len := NULL]
# plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
plaza = plaza[!duplicated(plaza), ]
head(plaza)
}## OrthoDB_ID Ortholog
## <char> <char>
## 1: MD15G1256000 Maldo.hc.v1a1.ch16A.g19172.t1
## 2: MD15G1256000 Maldo.hc.v1a1.ch11A.g06055.t1
## 3: MD15G1256000 Maldo.hc.v1a1.ch11A.g06055.t2
## 4: MD15G1256000 Maldo.hc.v1a1.ch8A.g43704.t1
## 5: MD15G1256000 Maldo.hc.v1a1.ch14A.g12608.t1
## 6: MD15G1256000 Maldo.hc.v1a1.ch4A.g33125.t1
if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)
if (flag2 == 1) { # geneID and prot ID are completely different # make flags
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
} else {
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
}
df_compara = df_compara[!is.na(df_compara$to_geneID), ]
if (nrow(plaza) != 0) {
df_plaza = dplyr::filter(df, source == "PLAZA") %>%
dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}
if (nrow(plaza) != 0) {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))
dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
dt = dplyr::bind_rows(df_compara, df_other)
}
ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]
if (nrow(plaza) != 0) {
ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
ind = which(df$source %in% c('ensembl-compara'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 3
## from_geneID to_geneID source
## <chr> <chr> <chr>
## 1 AT4G10330 Maldo.hc.v1a1.ch10A.g00003 FastOMA
## 2 AT4G10340 Maldo.hc.v1a1.ch10A.g00004 FastOMA
## 3 AT5G44410 g1 FastOMA
## 4 AT5G44440 g1 FastOMA
## 5 AT1G71250 Maldo.hc.v1a1.ch10A.g00019 MCScanX
## 6 AT4G10170 Maldo.hc.v1a1.ch10A.g00024 MCScanX
## 7 ATMG01320 Maldo.hc.v1a1.sc45A.g49892 MCScanX
## 8 AT2G07655 Maldo.hc.v1a1.sc45A.g49893 MCScanX
## 9 AT2G46320 Maldo.hc.v1a1.ch1A.g26266 OrthoDB
## 10 AT4G27940 Maldo.hc.v1a1.ch1A.g26266 OrthoDB
## 11 AT2G07695 Maldo.hc.v1a1.sc164A.g48922 OrthoDB
## 12 AT2G07695 Maldo.hc.v1a1.sc119A.g48697 OrthoDB
## 13 AT1G01020 Maldo.hc.v1a1.ch1A.g25188 PLAZA
## 14 AT1G01020 Maldo.hc.v1a1.ch7A.g41989 PLAZA
## 15 AT1G16360 Maldo.hc.v1a1.ch17A.g24061 PLAZA
## 16 AT1G79450 Maldo.hc.v1a1.ch17A.g24061 PLAZA
## 17 AT1G01020 Maldo.hc.v1a1.ch7A.g41990 RBH
## 18 AT1G01030 Maldo.hc.v1a1.ch1A.g25187 RBH
## 19 ATMG01410 Maldo.hc.v1a1.sc36A.g49760 RBH
## 20 ATMG01410 Maldo.hc.v1a1.sc71A.g49908 RBH
## 21 AT1G01020 Maldo.hc.v1a1.ch1A.g25188 ensembl-compara
## 22 AT1G01050 Maldo.hc.v1a1.ch1A.g25184 ensembl-compara
## 23 AT5G67630 Maldo.hc.v1a1.ch2A.g27934 ensembl-compara
## 24 AT5G67630 Maldo.hc.v1a1.ch15A.g17150 ensembl-compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1091349 58.3 2079652 111.1 2079652 111.1
## Vcells 4849864 37.1 16015626 122.2 16007166 122.2
## [1] 23053
## [1] 34410
##
## ensembl-compara FastOMA MCScanX OrthoDB PLAZA
## 20997 77128 32045 56069 30068
## RBH
## 37682
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods")
# Print or save the plot
print(upset_plot)>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
## Key: <from_geneID, to_geneID>
## from_geneID to_geneID FastOMA MCScanX OrthoDB PLAZA
## <char> <char> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: AT1G25470.uORF1 Maldo.hc.v1a1.ch13A.g09400 FALSE TRUE FALSE FALSE
## 2: AT1G58120.uORF1 Maldo.hc.v1a1.ch15A.g18095 FALSE TRUE FALSE FALSE
## 3: AT1G68550.uORF1 Maldo.hc.v1a1.ch13A.g09400 FALSE TRUE FALSE FALSE
## 4: AT1G68550.uORF1 Maldo.hc.v1a1.ch16A.g19057 FALSE TRUE FALSE FALSE
## 5: AT4G25670.uORF1 Maldo.hc.v1a1.ch1A.g26199 FALSE TRUE FALSE FALSE
## 6: AT4G25670.uORF1 Maldo.hc.v1a1.ch7A.g43057 FALSE TRUE FALSE FALSE
## 7: AT4G25690.uORF1 Maldo.hc.v1a1.ch1A.g26199 FALSE TRUE FALSE FALSE
## 8: AT4G25690.uORF1 Maldo.hc.v1a1.ch7A.g43057 FALSE TRUE FALSE FALSE
## 9: AT4G30960.uORF1 Maldo.hc.v1a1.ch15A.g15777 FALSE TRUE FALSE FALSE
## 10: AT4G30960.uORF1 Maldo.hc.v1a1.ch2A.g26596 FALSE TRUE FALSE FALSE
## 11: AT5G52550.uORF1 Maldo.hc.v1a1.ch1A.g26199 FALSE TRUE FALSE FALSE
## 12: AT5G52550.uORF1 Maldo.hc.v1a1.ch7A.g43057 FALSE TRUE FALSE FALSE
## RBH ensembl-compara count_evidence
## <lgcl> <lgcl> <num>
## 1: FALSE FALSE 1
## 2: FALSE FALSE 1
## 3: FALSE FALSE 1
## 4: FALSE FALSE 1
## 5: FALSE FALSE 1
## 6: FALSE FALSE 1
## 7: FALSE FALSE 1
## 8: FALSE FALSE 1
## 9: FALSE FALSE 1
## 10: FALSE FALSE 1
## 11: FALSE FALSE 1
## 12: FALSE FALSE 1
# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]
combined = gmm[, .(
BINCODE = paste(unique(BINCODE), collapse = " | "),
NAME = paste(unique(NAME), collapse = " | "),
DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]
charToRaw(combined$IDENTIFIER[1])## [1] 27 6d 61 6c 64 6f 2e 68 63 2e 76 31 61 31 2e 63 68 33 61 2e 67 33 31 32 39
## [26] 31 2e 74 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE) # change as needed
charToRaw(combined$IDENTIFIER[1])## [1] 6d 61 6c 64 6f 2e 68 63 2e 76 31 61 31 2e 63 68 33 61 2e 67 33 31 32 39 31
## [26] 2e 74 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2)) # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)##
## FALSE TRUE
## 14272 35833
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "ensembl-compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "athName" "athSynonims"
## [17] "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 122701 27
## [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6626957 354.0 11756942 627.9 11756942 627.9
## Vcells 112261222 856.5 190630912 1454.4 158751041 1211.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 23046
## [1] 34408
## [1] 1 128
## [1] 1 116
## [1] 319
## [1] 288
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag1 == 1) {
methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 95834 16984 4836
##
## 100% match no match partial match
## 1 47777 14508 4314
## 2 13486 1598 273
## 3 8903 517 98
## 4 8949 203 63
## 5 10121 123 56
## 6 6598 35 32
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
# Initialize an empty set or list covered_genes.
#
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
#
# For every row in the data table dt:
#
# a. Check if filter_criteria is "reject" for the row.
#
# b. Check if the value of the column corresponding to method in this row is TRUE.
#
# c. Check if from_geneID in this row is not in covered_genes.
#
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
# Check if gene pair is covered by all three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = "OrthoDB_FastOMA_RBH"
# Else if gene pair is covered by any two of those three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
# Else if MapMan4_Match string contains "match based on" and current method name:
# is_candidate = TRUE
# new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
# Else:
# is_candidate = FALSE
special_methods = c("OrthoDB", "RBH", "FastOMA")
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 5183 1882 4015
## #### #### #### ####
##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 5281 4015 1242 1072
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 32088 579 5183 670
## PLAZA RBH_MapMan4 reject
## 6577 1882 59065
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 53789 3695 1105
##
## 100% match no match partial match
## 1 14991 1733 738
## 2 7682 1198 145
## 3 6646 416 80
## 4 8004 191 59
## 5 9868 122 51
## 6 6598 35 32
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 21121
## [1] 33249
## [1] 1 93
## [1] 1 96
## [1] 32
## [1] 28
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 163 83 71 35
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 1337 26 122 30
## PLAZA RBH_MapMan4 reject
## 264 29 1788
openxlsx::write.xlsx(pss_long,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName1 = 'ppe', # change name - PLAZA, OrthoDB, RBH
plantName2 = 'prunus_persica', # change name - compara # sources
plantName3 = 'ppe', # change name - MCScanX # sources
plantName4 = 'pper', # change name - FastOMA # sources
plantDirIn = "ppe_peach", # inconsistent-IDs, orthofinder
plantNameOut = "peach",
plantDirOut = file.path('..', 'reports', 'fruitTrees', "peach"),
pattern_in = "\\.[^.]*$", # everythin after the last dot
pattern_out = "", # all-IDs
compara_pattern_in1 = "",
compara_pattern_out1 = "",
compara_pattern_in2 = "",
compara_pattern_out2 = "",
plaza_pattern_in1 = "",
plaza_pattern_in2 = "",
ref_genome = "PLAZA_proteome.selected_transcript.ppe", # inconsistent-IDs, orthofinder for OrthoDB
mercator = 'pper_Mercator4v7_results.txt', # plant-gmm
mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']", # plant-gmm, generic removal of nonsence
mercatorPatternOut1 = "", # plant-gmm
mercatorPatternIn2 = "g", # plant-gmm
mercatorPatternOut2 = "G", # plant-gmm
flag1 = 1,
flag2 = 2,
flag3 = FALSE
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000024f4761edd0>
##
##
## processing file: ./08_fruitTrees-child1.rmd
| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-44] | |….. | 9% | |…… | 12% [unnamed-chunk-45] | |…….. | 15% | |……… | 18% [unnamed-chunk-46] | |……….. | 21% | |………… | 24% [unnamed-chunk-47] | |………….. | 27% | |…………… | 30% [unnamed-chunk-48] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-49] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-50] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-51] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-52] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-53] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-54] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-55] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-56] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-57] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-58] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-59] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'ensembl-compara') {
dt = dt[dt$homology_species == plantName2, ]
# print(head(dt))
dt = dt[, c(1,2,6,7,10)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName4, ]
# print(head(dt))
dt = dt[, c(2,1, 4,3, 5)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'MCScanX') {
# dt = dt[grepl('stu', dt$to_plant), ]
dt = dt[grepl(plantName3, dt$to_plant), ] # change names
# print(head(dt))
dt = dt[, c(2,1, 4,3, 6)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$orthologous_species == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else print ('ERROR: Unknown source')
}## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
##
## ensembl-compara FastOMA MCScanX OrthoDB PLAZA
## 16851 44006 62733 38370 20774
## RBH
## 24564
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 <NA> AT1G12040.1 <NA> Prupe.1G000500.1 FastOMA
## 2 <NA> AT1G62440.1 <NA> Prupe.1G000500.1 FastOMA
## 3 <NA> AT1G61010.3 <NA> Prupe.I006100.1 FastOMA
## 4 <NA> AT1G61010.3 <NA> Prupe.I006200.1 FastOMA
## 5 <NA> AT5G58130.1 <NA> Prupe.1G000700.1 MCScanX
## 6 <NA> AT5G58110.1 <NA> Prupe.1G000800.1 MCScanX
## 7 <NA> AT4G02060.1 <NA> Prupe.8G272000.1 MCScanX
## 8 <NA> AT4G02060.2 <NA> Prupe.8G272000.1 MCScanX
## 9 AT3G17900 <NA> Prupe.1G267800 <NA> OrthoDB
## 10 AT4G35230 <NA> Prupe.1G355500 <NA> OrthoDB
## 11 AT2G07675 <NA> Prupe.6G146800 <NA> OrthoDB
## 12 ATMG00980 <NA> Prupe.6G146800 <NA> OrthoDB
## 13 AT1G01020 <NA> Prupe.2G201100 <NA> PLAZA
## 14 AT1G01050 <NA> Prupe.2G200700 <NA> PLAZA
## 15 AT1G52360 <NA> Prupe.I003200 <NA> PLAZA
## 16 AT5G40850 <NA> Prupe.I005100 <NA> PLAZA
## 17 AT1G01030 <NA> Prupe.5G134900 <NA> RBH
## 18 AT1G01040 <NA> Prupe.2G200900 <NA> RBH
## 19 ATMG01250 <NA> Prupe.6G123900 <NA> RBH
## 20 ATMG01250 <NA> Prupe.7G164000 <NA> RBH
## 21 AT1G01020 AT1G01020.1 PRUPE_2G201100 ONI23664 ensembl-compara
## 22 AT1G01040 AT1G01040.2 PRUPE_2G200900 ONI23660 ensembl-compara
## 23 AT5G67620 AT5G67620.1 PRUPE_6G219300 ONI02738 ensembl-compara
## 24 AT5G67630 AT5G67630.1 PRUPE_1G544700 ONI35595 ensembl-compara
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])
# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)##
## MCScanX
## 23
## from_geneID from_protID to_geneID to_protID source
## <char> <char> <char> <char> <char>
## 1: AT3G25570.uORF1 AT3G25570.uORF1 <NA> Prupe.1G299600.1 MCScanX
## 2: AT1G25470.uORF1 AT1G25470.uORF1 <NA> Prupe.1G310000.1 MCScanX
## 3: AT1G68550.uORF1 AT1G68550.uORF1 <NA> Prupe.1G310000.1 MCScanX
## 4: AT1G23150.uORF1 AT1G23150.uORF1 <NA> Prupe.1G329400.1 MCScanX
## 5: AT1G70780.uORF1 AT1G70780.uORF1 <NA> Prupe.1G329400.1 MCScanX
## 6: AT1G75390.uORF1 AT1G75390.uORF1 <NA> Prupe.1G374500.1 MCScanX
## 7: AT5G50010.uORF2 AT5G50010.uORF2 <NA> Prupe.1G527700.1 MCScanX
## 8: AT4G25670.uORF1 AT4G25670.uORF1 <NA> Prupe.2G300500.1 MCScanX
## 9: AT4G25690.uORF1 AT4G25690.uORF1 <NA> Prupe.2G300500.1 MCScanX
## 10: AT5G52550.uORF1 AT5G52550.uORF1 <NA> Prupe.2G300500.1 MCScanX
## 11: AT5G53590.uORF1 AT5G53590.uORF1 <NA> Prupe.2G317000.1 MCScanX
## 12: AT3G02470.uORF1 AT3G02470.uORF1 <NA> Prupe.3G243800.1 MCScanX
## 13: AT5G15950.uORF1 AT5G15950.uORF1 <NA> Prupe.3G243800.1 MCScanX
## 14: AT1G29950.uORF2 AT1G29950.uORF2 <NA> Prupe.4G077000.1 MCScanX
## 15: AT4G19110.uORF1 AT4G19110.uORF1 <NA> Prupe.5G021200.1 MCScanX
## 16: AT5G45430.uORF1 AT5G45430.uORF1 <NA> Prupe.5G021200.1 MCScanX
## 17: AT2G27230.uORF1 AT2G27230.uORF1 <NA> Prupe.6G144400.1 MCScanX
## 18: AT3G12010.uORF1 AT3G12010.uORF1 <NA> Prupe.7G082600.1 MCScanX
## 19: AT4G36990.uORF1 AT4G36990.uORF1 <NA> Prupe.7G133500.1 MCScanX
## 20: AT2G18160.uORF1 AT2G18160.uORF1 <NA> Prupe.7G160500.1 MCScanX
## 21: AT5G09460.uORF1 AT5G09460.uORF1 <NA> Prupe.8G067700.1 MCScanX
## 22: AT5G64340.uORF1 AT5G64340.uORF1 <NA> Prupe.8G067700.1 MCScanX
## 23: AT4G34590.uORF1 AT4G34590.uORF1 <NA> Prupe.8G091700.1 MCScanX
## from_geneID from_protID to_geneID to_protID source
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 AT1G12040.1 Prupe.1G000500 Prupe.1G000500.1 FastOMA
## 2 AT1G62440 AT1G62440.1 Prupe.1G000500 Prupe.1G000500.1 FastOMA
## 3 AT1G61010 AT1G61010.3 Prupe.I006100 Prupe.I006100.1 FastOMA
## 4 AT1G61010 AT1G61010.3 Prupe.I006200 Prupe.I006200.1 FastOMA
## 5 AT5G58130 AT5G58130.1 Prupe.1G000700 Prupe.1G000700.1 MCScanX
## 6 AT5G58110 AT5G58110.1 Prupe.1G000800 Prupe.1G000800.1 MCScanX
## 7 AT4G02060 AT4G02060.1 Prupe.8G272000 Prupe.8G272000.1 MCScanX
## 8 AT4G02060 AT4G02060.2 Prupe.8G272000 Prupe.8G272000.1 MCScanX
## 9 AT3G17900 <NA> Prupe.1G267800 <NA> OrthoDB
## 10 AT4G35230 <NA> Prupe.1G355500 <NA> OrthoDB
## 11 AT2G07675 <NA> Prupe.6G146800 <NA> OrthoDB
## 12 ATMG00980 <NA> Prupe.6G146800 <NA> OrthoDB
## 13 AT1G01020 <NA> Prupe.2G201100 <NA> PLAZA
## 14 AT1G01050 <NA> Prupe.2G200700 <NA> PLAZA
## 15 AT1G52360 <NA> Prupe.I003200 <NA> PLAZA
## 16 AT5G40850 <NA> Prupe.I005100 <NA> PLAZA
## 17 AT1G01030 <NA> Prupe.5G134900 <NA> RBH
## 18 AT1G01040 <NA> Prupe.2G200900 <NA> RBH
## 19 ATMG01250 <NA> Prupe.6G123900 <NA> RBH
## 20 ATMG01250 <NA> Prupe.7G164000 <NA> RBH
## 21 AT1G01020 AT1G01020.1 PRUPE_2G201100 ONI23664 ensembl-compara
## 22 AT1G01040 AT1G01040.2 PRUPE_2G200900 ONI23660 ensembl-compara
## 23 AT5G67620 AT5G67620.1 PRUPE_6G219300 ONI02738 ensembl-compara
## 24 AT5G67630 AT5G67630.1 PRUPE_1G544700 ONI35595 ensembl-compara
summary_na = df[, .(
na_to_geneID = sum(is.na(to_geneID)),
na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)## source na_to_geneID na_to_protID
## <char> <int> <int>
## 1: ensembl-compara 0 0
## 2: FastOMA 0 0
## 3: MCScanX 0 0
## 4: OrthoDB 0 38370
## 5: PLAZA 0 20774
## 6: RBH 0 24564
here we have some loses because genes between versions do not translate well!
fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
plaza = data.table::fread(file.path(fp, fn))
} else {
plaza = data.frame(matrix(ncol = 4, nrow = 0))
}
compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name
colnames(compara)[3] = colnames(plaza)[3] = 'source'
compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1,
sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)## OrthoDB_ID Ortholog
## <char> <char>
## 1: ONH90473 Prupe.8G055800.1
## 2: ONI29575 Prupe.1G202700.1
## 3: ONH90342 Prupe.8G047800.1
## 4: ONH90445 Prupe.8G054500.1
## 5: ONI26493 Prupe.1G028600.3
## 6: ONI21208 Prupe.2G053400.1
if (nrow(plaza) != 0) {
plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = plaza[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(plaza))]
plaza = result[, seq_len := NULL]
# plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
plaza = plaza[!duplicated(plaza), ]
head(plaza)
}
if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)
if (flag2 == 1) { # geneID and prot ID are completely different # make flags
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
} else {
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
}
df_compara = df_compara[!is.na(df_compara$to_geneID), ]
if (nrow(plaza) != 0) {
df_plaza = dplyr::filter(df, source == "PLAZA") %>%
dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}
if (nrow(plaza) != 0) {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))
dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
dt = dplyr::bind_rows(df_compara, df_other)
}
ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]
if (nrow(plaza) != 0) {
ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
ind = which(df$source %in% c('ensembl-compara'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 3
## from_geneID to_geneID source
## <chr> <chr> <chr>
## 1 AT1G12040 Prupe.1G000500 FastOMA
## 2 AT1G62440 Prupe.1G000500 FastOMA
## 3 AT1G61010 Prupe.I006100 FastOMA
## 4 AT1G61010 Prupe.I006200 FastOMA
## 5 AT5G58130 Prupe.1G000700 MCScanX
## 6 AT5G58110 Prupe.1G000800 MCScanX
## 7 AT3G62540 Prupe.8G271400 MCScanX
## 8 AT4G02060 Prupe.8G272000 MCScanX
## 9 AT3G17900 Prupe.1G267800 OrthoDB
## 10 AT4G35230 Prupe.1G355500 OrthoDB
## 11 AT2G07675 Prupe.6G146800 OrthoDB
## 12 ATMG00980 Prupe.6G146800 OrthoDB
## 13 AT1G01020 Prupe.2G201100 PLAZA
## 14 AT1G01050 Prupe.2G200700 PLAZA
## 15 AT1G52360 Prupe.I003200 PLAZA
## 16 AT5G40850 Prupe.I005100 PLAZA
## 17 AT1G01030 Prupe.5G134900 RBH
## 18 AT1G01040 Prupe.2G200900 RBH
## 19 ATMG01250 Prupe.6G123900 RBH
## 20 ATMG01250 Prupe.7G164000 RBH
## 21 AT1G01020 Prupe.2G201100 ensembl-compara
## 22 AT1G01040 Prupe.2G200900 ensembl-compara
## 23 AT5G67620 Prupe.6G219300 ensembl-compara
## 24 AT5G67630 Prupe.1G544700 ensembl-compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4673117 249.6 9405554 502.4 11756942 627.9
## Vcells 120380685 918.5 190630912 1454.4 190628107 1454.4
## [1] 23136
## [1] 21243
##
## ensembl-compara FastOMA MCScanX OrthoDB PLAZA
## 16193 44006 17894 38370 20774
## RBH
## 24564
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods")
# Print or save the plot
print(upset_plot)>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
## Key: <from_geneID, to_geneID>
## from_geneID to_geneID FastOMA MCScanX OrthoDB PLAZA RBH
## <char> <char> <lgcl> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: AT1G23150.uORF1 Prupe.1G329400 FALSE TRUE FALSE FALSE FALSE
## 2: AT1G25470.uORF1 Prupe.1G310000 FALSE TRUE FALSE FALSE FALSE
## 3: AT1G29950.uORF2 Prupe.4G077000 FALSE TRUE FALSE FALSE FALSE
## 4: AT1G68550.uORF1 Prupe.1G310000 FALSE TRUE FALSE FALSE FALSE
## 5: AT1G70780.uORF1 Prupe.1G329400 FALSE TRUE FALSE FALSE FALSE
## 6: AT1G75390.uORF1 Prupe.1G374500 FALSE TRUE FALSE FALSE FALSE
## 7: AT2G18160.uORF1 Prupe.7G160500 FALSE TRUE FALSE FALSE FALSE
## 8: AT2G27230.uORF1 Prupe.6G144400 FALSE TRUE FALSE FALSE FALSE
## 9: AT3G02470.uORF1 Prupe.3G243800 FALSE TRUE FALSE FALSE FALSE
## 10: AT3G12010.uORF1 Prupe.7G082600 FALSE TRUE FALSE FALSE FALSE
## 11: AT3G25570.uORF1 Prupe.1G299600 FALSE TRUE FALSE FALSE FALSE
## 12: AT4G19110.uORF1 Prupe.5G021200 FALSE TRUE FALSE FALSE FALSE
## 13: AT4G25670.uORF1 Prupe.2G300500 FALSE TRUE FALSE FALSE FALSE
## 14: AT4G25690.uORF1 Prupe.2G300500 FALSE TRUE FALSE FALSE FALSE
## 15: AT4G34590.uORF1 Prupe.8G091700 FALSE TRUE FALSE FALSE FALSE
## 16: AT4G36990.uORF1 Prupe.7G133500 FALSE TRUE FALSE FALSE FALSE
## 17: AT5G09460.uORF1 Prupe.8G067700 FALSE TRUE FALSE FALSE FALSE
## 18: AT5G15950.uORF1 Prupe.3G243800 FALSE TRUE FALSE FALSE FALSE
## 19: AT5G45430.uORF1 Prupe.5G021200 FALSE TRUE FALSE FALSE FALSE
## 20: AT5G50010.uORF2 Prupe.1G527700 FALSE TRUE FALSE FALSE FALSE
## 21: AT5G52550.uORF1 Prupe.2G300500 FALSE TRUE FALSE FALSE FALSE
## 22: AT5G53590.uORF1 Prupe.2G317000 FALSE TRUE FALSE FALSE FALSE
## 23: AT5G64340.uORF1 Prupe.8G067700 FALSE TRUE FALSE FALSE FALSE
## from_geneID to_geneID FastOMA MCScanX OrthoDB PLAZA RBH
## ensembl-compara count_evidence
## <lgcl> <num>
## 1: FALSE 1
## 2: FALSE 1
## 3: FALSE 1
## 4: FALSE 1
## 5: FALSE 1
## 6: FALSE 1
## 7: FALSE 1
## 8: FALSE 1
## 9: FALSE 1
## 10: FALSE 1
## 11: FALSE 1
## 12: FALSE 1
## 13: FALSE 1
## 14: FALSE 1
## 15: FALSE 1
## 16: FALSE 1
## 17: FALSE 1
## 18: FALSE 1
## 19: FALSE 1
## 20: FALSE 1
## 21: FALSE 1
## 22: FALSE 1
## 23: FALSE 1
## ensembl-compara count_evidence
# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]
combined = gmm[, .(
BINCODE = paste(unique(BINCODE), collapse = " | "),
NAME = paste(unique(NAME), collapse = " | "),
DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]
charToRaw(combined$IDENTIFIER[1])## [1] 27 70 72 75 70 65 2e 33 67 30 30 34 31 30 30 2e 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE) # change as needed
charToRaw(combined$IDENTIFIER[1])## [1] 70 72 75 70 65 2e 33 67 30 30 34 31 30 30 2e 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2)) # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)##
## FALSE TRUE
## 5670 21203
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "ensembl-compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "athName" "athSynonims"
## [17] "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 71187 231
## [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
## [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
## [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
## [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
## [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
## [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4819358 257.4 9405554 502.4 11756942 627.9
## Vcells 77105237 588.3 190630912 1454.4 190628107 1454.4
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 23113
## [1] 21227
## [1] 1 57
## [1] 1 115
## [1] 264
## [1] 135
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag1 == 1) {
methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 60256 8696 2466
##
## 100% match no match partial match
## 1 28995 7390 2167
## 2 8716 797 143
## 3 5356 231 64
## 4 5513 140 40
## 5 6882 93 32
## 6 4794 45 20
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
# Initialize an empty set or list covered_genes.
#
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
#
# For every row in the data table dt:
#
# a. Check if filter_criteria is "reject" for the row.
#
# b. Check if the value of the column corresponding to method in this row is TRUE.
#
# c. Check if from_geneID in this row is not in covered_genes.
#
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
# Check if gene pair is covered by all three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = "OrthoDB_FastOMA_RBH"
# Else if gene pair is covered by any two of those three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
# Else if MapMan4_Match string contains "match based on" and current method name:
# is_candidate = TRUE
# new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
# Else:
# is_candidate = FALSE
special_methods = c("OrthoDB", "RBH", "FastOMA")
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 3923 1100 1947
## #### #### #### ####
##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 5084 1947 1109 231
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 17871 415 3923 516
## PLAZA RBH_MapMan4 reject
## 4912 1100 34310
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 34589 1921 598
##
## 100% match no match partial match
## 1 9932 949 397
## 2 4832 530 62
## 3 3682 179 49
## 4 4769 125 38
## 5 6580 93 32
## 6 4794 45 20
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20588
## [1] 20794
## [1] 1 50
## [1] 1 96
## [1] 7
## [1] 15
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 145 38 43 7
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 728 27 51 16
## PLAZA RBH_MapMan4 reject
## 166 19 1154
openxlsx::write.xlsx(pss_long,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName1 = 'pdul'
, # change name - PLAZA, OrthoDB, RBH
plantName2 = 'prunus_dulcis'
, # change name - compara # sources
plantName3 = 'almond'
, # change name - MCScanX # sources
plantName4 = 'pdul'
, # change name - FastOMA # sources
plantDirIn = "pdul_almond"
, # inconsistent-IDs, orthofinder
plantNameOut = "almond"
,
plantDirOut = file.path('..', 'reports', 'fruitTrees', "almond")
,
pattern_in = "\\.[^.]*$"
, # everythin after the last dot
pattern_out = ""
, # all-IDs
compara_pattern_in1 = ""
,
compara_pattern_out1 = ""
,
compara_pattern_in2 = ""
,
compara_pattern_out2 = ""
,
plaza_pattern_in1 = ""
,
plaza_pattern_in2 = ""
,
ref_genome = "Texas_F1_protein"
, # inconsistent-IDs, orthofinder for OrthoDB
mercator = 'pdul_Mercator4v7_results.txt'
, # plant-gmm
mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']"
, # plant-gmm, generic removal of nonsence
mercatorPatternOut1 = ""
, # plant-gmm
mercatorPatternIn2 = "([fg])"
, # plant-gmm
mercatorPatternOut2 = "\\U\\1" # plant-gmm
,
flag1 = 2
,
flag2 = 2
,
flag3 = FALSE
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000024f76449350>
##
##
## processing file: ./08_fruitTrees-child1.rmd
| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-78] | |….. | 9% | |…… | 12% [unnamed-chunk-79] | |…….. | 15% | |……… | 18% [unnamed-chunk-80] | |……….. | 21% | |………… | 24% [unnamed-chunk-81] | |………….. | 27% | |…………… | 30% [unnamed-chunk-82] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-83] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-84] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-85] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-86] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-87] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-88] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-89] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-90] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-91] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-92] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-93] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'ensembl-compara') {
dt = dt[dt$homology_species == plantName2, ]
# print(head(dt))
dt = dt[, c(1,2,6,7,10)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName4, ]
# print(head(dt))
dt = dt[, c(2,1, 4,3, 5)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'MCScanX') {
# dt = dt[grepl('stu', dt$to_plant), ]
dt = dt[grepl(plantName3, dt$to_plant), ] # change names
# print(head(dt))
dt = dt[, c(2,1, 4,3, 6)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$orthologous_species == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else print ('ERROR: Unknown source')
}## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
##
## ensembl-compara FastOMA MCScanX OrthoDB RBH
## 16421 42927 36681 35734 24829
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 <NA> AT2G05620.2 <NA> TexasF1_G1000.1 FastOMA
## 2 <NA> AT3G48880.2 <NA> TexasF1_G10001.1 FastOMA
## 3 <NA> AT3G48890.1 <NA> TexasF1_G9999.1 FastOMA
## 4 <NA> AT5G52240.1 <NA> TexasF1_G9999.1 FastOMA
## 5 <NA> AT5G22060.1 <NA> TexasF1_G100.1 MCScanX
## 6 <NA> AT3G48880.1 <NA> TexasF1_G10000.1 MCScanX
## 7 <NA> AT5G52240.1 <NA> TexasF1_G9999.1 MCScanX
## 8 <NA> AT5G52240.2 <NA> TexasF1_G9999.1 MCScanX
## 9 AT1G23390 <NA> TexasF1_G3359 <NA> OrthoDB
## 10 AT5G19210 <NA> TexasF1_G2060 <NA> OrthoDB
## 11 AT3G51810 <NA> TexasF1_G23162 <NA> OrthoDB
## 12 AT2G28815 <NA> TexasF1_G6420 <NA> OrthoDB
## 13 AT1G01030 <NA> TexasF1_G18833 <NA> RBH
## 14 AT1G01040 <NA> TexasF1_G9057 <NA> RBH
## 15 ATMG00860 <NA> TexasF1_G25095 <NA> RBH
## 16 ATMG01250 <NA> TexasF1_G22012 <NA> RBH
## 17 AT1G01020 AT1G01020.1 Prudul26B025674 VVA35962 ensembl-compara
## 18 AT1G01040 AT1G01040.2 Prudul26B022109 VVA35960 ensembl-compara
## 19 AT5G67620 AT5G67620.1 Prudul26B020268 VVA23378 ensembl-compara
## 20 AT5G67630 AT5G67630.1 Prudul26B028327 VVA20286 ensembl-compara
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])
# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)##
## MCScanX
## 5
## from_geneID from_protID to_geneID to_protID source
## <char> <char> <char> <char> <char>
## 1: AT4G19110.uORF1 AT4G19110.uORF1 <NA> TexasF1_G17548.1 MCScanX
## 2: AT5G45430.uORF1 AT5G45430.uORF1 <NA> TexasF1_G17548.1 MCScanX
## 3: AT1G48600.uORF1 AT1G48600.uORF1 <NA> TexasF1_G19862.1 MCScanX
## 4: AT1G06150.uORF1 AT1G06150.uORF1 <NA> TexasF1_G29714.1 MCScanX
## 5: AT2G31280.uORF1 AT2G31280.uORF1 <NA> TexasF1_G29714.1 MCScanX
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G05620 AT2G05620.2 TexasF1_G1000 TexasF1_G1000.1 FastOMA
## 2 AT3G48880 AT3G48880.2 TexasF1_G10001 TexasF1_G10001.1 FastOMA
## 3 AT3G48890 AT3G48890.1 TexasF1_G9999 TexasF1_G9999.1 FastOMA
## 4 AT5G52240 AT5G52240.1 TexasF1_G9999 TexasF1_G9999.1 FastOMA
## 5 AT5G22060 AT5G22060.1 TexasF1_G100 TexasF1_G100.1 MCScanX
## 6 AT3G48880 AT3G48880.1 TexasF1_G10000 TexasF1_G10000.1 MCScanX
## 7 AT5G52240 AT5G52240.1 TexasF1_G9999 TexasF1_G9999.1 MCScanX
## 8 AT5G52240 AT5G52240.2 TexasF1_G9999 TexasF1_G9999.1 MCScanX
## 9 AT1G23390 <NA> TexasF1_G3359 <NA> OrthoDB
## 10 AT5G19210 <NA> TexasF1_G2060 <NA> OrthoDB
## 11 AT3G51810 <NA> TexasF1_G23162 <NA> OrthoDB
## 12 AT2G28815 <NA> TexasF1_G6420 <NA> OrthoDB
## 13 AT1G01030 <NA> TexasF1_G18833 <NA> RBH
## 14 AT1G01040 <NA> TexasF1_G9057 <NA> RBH
## 15 ATMG00860 <NA> TexasF1_G25095 <NA> RBH
## 16 ATMG01250 <NA> TexasF1_G22012 <NA> RBH
## 17 AT1G01020 AT1G01020.1 Prudul26B025674 VVA35962 ensembl-compara
## 18 AT1G01040 AT1G01040.2 Prudul26B022109 VVA35960 ensembl-compara
## 19 AT5G67620 AT5G67620.1 Prudul26B020268 VVA23378 ensembl-compara
## 20 AT5G67630 AT5G67630.1 Prudul26B028327 VVA20286 ensembl-compara
summary_na = df[, .(
na_to_geneID = sum(is.na(to_geneID)),
na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)## source na_to_geneID na_to_protID
## <char> <int> <int>
## 1: ensembl-compara 0 0
## 2: FastOMA 0 0
## 3: MCScanX 0 0
## 4: OrthoDB 0 35734
## 5: RBH 0 24829
here we have some loses because genes between versions do not translate well!
fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
plaza = data.table::fread(file.path(fp, fn))
} else {
plaza = data.frame(matrix(ncol = 4, nrow = 0))
}
compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name
colnames(compara)[3] = colnames(plaza)[3] = 'source'
compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1,
sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)## OrthoDB_ID Ortholog
## <char> <char>
## 1: VVA36572 TexasF1_G6946.1
## 2: VVA37509 TexasF1_G1328.1
## 3: VVA37509 TexasF1_G1832.1
## 4: VVA37509 TexasF1_G24981.1
## 5: VVA37509 TexasF1_G6849.1
## 6: VVA37509 TexasF1_G16889.1
if (nrow(plaza) != 0) {
plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = plaza[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(plaza))]
plaza = result[, seq_len := NULL]
# plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
plaza = plaza[!duplicated(plaza), ]
head(plaza)
}
if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)
if (flag2 == 1) { # geneID and prot ID are completely different # make flags
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
} else {
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
}
df_compara = df_compara[!is.na(df_compara$to_geneID), ]
if (nrow(plaza) != 0) {
df_plaza = dplyr::filter(df, source == "PLAZA") %>%
dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}
if (nrow(plaza) != 0) {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))
dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
dt = dplyr::bind_rows(df_compara, df_other)
}
ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]
if (nrow(plaza) != 0) {
ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
ind = which(df$source %in% c('ensembl-compara'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 3
## from_geneID to_geneID source
## <chr> <chr> <chr>
## 1 AT2G05620 TexasF1_G1000 FastOMA
## 2 AT3G48880 TexasF1_G10001 FastOMA
## 3 AT3G48890 TexasF1_G9999 FastOMA
## 4 AT5G52240 TexasF1_G9999 FastOMA
## 5 AT5G22060 TexasF1_G100 MCScanX
## 6 AT3G48880 TexasF1_G10000 MCScanX
## 7 AT3G48890 TexasF1_G9999 MCScanX
## 8 AT5G52240 TexasF1_G9999 MCScanX
## 9 AT1G23390 TexasF1_G3359 OrthoDB
## 10 AT5G19210 TexasF1_G2060 OrthoDB
## 11 AT3G51810 TexasF1_G23162 OrthoDB
## 12 AT2G28815 TexasF1_G6420 OrthoDB
## 13 AT1G01030 TexasF1_G18833 RBH
## 14 AT1G01040 TexasF1_G9057 RBH
## 15 ATMG00860 TexasF1_G25095 RBH
## 16 ATMG01250 TexasF1_G22012 RBH
## 17 AT1G01020 TexasF1_G9059 ensembl-compara
## 18 AT1G01040 TexasF1_G9057 ensembl-compara
## 19 AT5G67630 TexasF1_G6308 ensembl-compara
## 20 AT5G67630 TexasF1_G6731 ensembl-compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3604227 192.5 9405554 502.4 11756942 627.9
## Vcells 81462298 621.6 190630912 1454.4 190628107 1454.4
## [1] 22456
## [1] 20903
##
## ensembl-compara FastOMA MCScanX OrthoDB RBH
## 15975 42927 20151 35734 24829
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods")
# Print or save the plot
print(upset_plot)>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
## Key: <from_geneID, to_geneID>
## from_geneID to_geneID FastOMA MCScanX OrthoDB RBH
## <char> <char> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: AT1G06150.uORF1 TexasF1_G29714 FALSE TRUE FALSE FALSE
## 2: AT1G48600.uORF1 TexasF1_G19862 FALSE TRUE FALSE FALSE
## 3: AT2G31280.uORF1 TexasF1_G29714 FALSE TRUE FALSE FALSE
## 4: AT4G19110.uORF1 TexasF1_G17548 FALSE TRUE FALSE FALSE
## 5: AT5G45430.uORF1 TexasF1_G17548 FALSE TRUE FALSE FALSE
## ensembl-compara count_evidence
## <lgcl> <num>
## 1: FALSE 1
## 2: FALSE 1
## 3: FALSE 1
## 4: FALSE 1
## 5: FALSE 1
# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]
combined = gmm[, .(
BINCODE = paste(unique(BINCODE), collapse = " | "),
NAME = paste(unique(NAME), collapse = " | "),
DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]
charToRaw(combined$IDENTIFIER[1])## [1] 27 74 65 78 61 73 66 31 5f 67 31 30 34 30 31 2e 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE) # change as needed
charToRaw(combined$IDENTIFIER[1])## [1] 74 65 78 61 73 66 31 5f 67 31 30 34 30 31 2e 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2)) # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)##
## FALSE TRUE
## 8713 20903
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "ensembl-compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "athName" "athSynonims" "all_pathways"
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE
## 67028
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3140113 167.8 9405554 502.4 11756942 627.9
## Vcells 44057647 336.2 152504730 1163.6 190628107 1454.4
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 22451
## [1] 20902
## [1] 1 150
## [1] 1 113
## [1] 171
## [1] 120
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag1 == 1) {
methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 54247 10072 2709
##
## 100% match no match partial match
## 1 26635 8113 2222
## 2 8151 1113 217
## 3 5577 378 94
## 4 6765 254 89
## 5 7119 214 87
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
# Initialize an empty set or list covered_genes.
#
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
#
# For every row in the data table dt:
#
# a. Check if filter_criteria is "reject" for the row.
#
# b. Check if the value of the column corresponding to method in this row is TRUE.
#
# c. Check if from_geneID in this row is not in covered_genes.
#
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
# Check if gene pair is covered by all three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = "OrthoDB_FastOMA_RBH"
# Else if gene pair is covered by any two of those three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
# Else if MapMan4_Match string contains "match based on" and current method name:
# is_candidate = TRUE
# new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
# Else:
# is_candidate = FALSE
special_methods = c("OrthoDB", "RBH", "FastOMA")
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 3883 1578 2305
## #### #### #### ####
##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 4234 2305 1038 583
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 20146 722 3883 692
## RBH_MapMan4 reject
## 1578 31847
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 31920 2377 884
##
## 100% match no match partial match
## 1 9237 838 497
## 2 4767 764 138
## 3 4396 319 80
## 4 6401 242 82
## 5 7119 214 87
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20087
## [1] 20115
## [1] 1 51
## [1] 1 96
## [1] 5
## [1] 11
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 118 50 53 25
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 843 42 66 19
## RBH_MapMan4 reject
## 21 1168
openxlsx::write.xlsx(pss_long,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName1 = 'pavi'
, # change name - PLAZA, OrthoDB, RBH
plantName2 = 'prunus_avium'
, # change name - compara # sources
plantName3 = 'wildcherry'
, # change name - MCScanX # sources
plantName4 = 'pavi'
, # change name - FastOMA # sources
plantDirIn = "pavi_wildCherry"
, # inconsistent-IDs, orthofinder
plantNameOut = "wildcherry"
,
plantDirOut = file.path('..', 'reports', 'fruitTrees', "wildcherry")
,
pattern_in = "-[^-]*$"
, # everythin after the last dot
pattern_out = ""
, # all-IDs
compara_pattern_in1 = "^(Pav_(sc|co)\\d+\\.\\d+_g\\d+\\.\\d+\\.(br|mk)).*" # instead of ids has some strange concatenation
,
compara_pattern_out1 = "\\1"
,
compara_pattern_in2 = ''
,
compara_pattern_out2 = ''
,
plaza_pattern_in1 = ""
,
plaza_pattern_in2 = ""
,
ref_genome = "Prunus_avium_Tieton.proteins"
, # inconsistent-IDs, orthofinder for OrthoDB
mercator = 'pavi_Mercator4v7_results.txt'
, # plant-gmm
mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']"
, # plant-gmm, generic removal of nonsence
mercatorPatternOut1 = ""
, # plant-gmm
mercatorPatternIn2 = "Fun"
, # plant-gmm
mercatorPatternOut2 = "FUN" # plant-gmm
,
flag1 = 2
,
flag2 = 1
,
flag3 = TRUE # compara$Ortholog contains mrna space gene
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000024f763469d8>
##
##
## processing file: ./08_fruitTrees-child1.rmd
| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-112] | |….. | 9% | |…… | 12% [unnamed-chunk-113] | |…….. | 15% | |……… | 18% [unnamed-chunk-114] | |……….. | 21% | |………… | 24% [unnamed-chunk-115] | |………….. | 27% | |…………… | 30% [unnamed-chunk-116] | |…………….. | 33% | |……………… | 36% [unnamed-chunk-117] | |……………….. | 39% | |………………… | 42% [unnamed-chunk-118] | |………………….. | 45% | |…………………… | 48% [unnamed-chunk-119] | |…………………….. | 52% | |……………………… | 55% [unnamed-chunk-120] | |……………………….. | 58% | |………………………… | 61% [unnamed-chunk-121] | |………………………….. | 64% | |…………………………… | 67% [unnamed-chunk-122] | |…………………………….. | 70% | |……………………………… | 73% [unnamed-chunk-123] | |……………………………….. | 76% | |………………………………… | 79% [unnamed-chunk-124] | |………………………………….. | 82% | |…………………………………… | 85% [unnamed-chunk-125] | |…………………………………….. | 88% | |……………………………………… | 91% [unnamed-chunk-126] | |……………………………………….. | 94% | |………………………………………… | 97% [unnamed-chunk-127] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'ensembl-compara') {
dt = dt[dt$homology_species == plantName2, ]
# print(head(dt))
dt = dt[, c(1,2,6,7,10)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName4, ]
# print(head(dt))
dt = dt[, c(2,1, 4,3, 5)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'MCScanX') {
# dt = dt[grepl('stu', dt$to_plant), ]
dt = dt[grepl(plantName3, dt$to_plant), ] # change names
# print(head(dt))
dt = dt[, c(2,1, 4,3, 6)]
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 1] = NA
dt[, 3] = NA
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$orthologous_species == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName1, ]
# print(head(dt))
colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
dt[, 2] = NA
dt[, 4] = NA
df = rbind(df, dt)
} else print ('ERROR: Unknown source')
}## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
##
## ensembl-compara FastOMA MCScanX OrthoDB RBH
## 4504 48941 39725 38228 25594
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 <NA> AT1G12040.1 <NA> FUN_000050-T1 FastO…
## 2 <NA> AT1G62440.1 <NA> FUN_000050-T1 FastO…
## 3 <NA> AT2G43840.2 <NA> FUN_040335-T1 FastO…
## 4 <NA> AT2G44050.1 <NA> FUN_040336-T1 FastO…
## 5 <NA> AT5G58130.1 <NA> FUN_000052-T1 MCSca…
## 6 <NA> AT5G58130.1 <NA> FUN_000053-T1 MCSca…
## 7 <NA> AT2G43840.2 <NA> FUN_040335-T1 MCSca…
## 8 <NA> AT2G44050.1 <NA> FUN_040336-T1 MCSca…
## 9 AT4G39370 <NA> FUN_020728 <NA> Ortho…
## 10 AT3G06350 <NA> FUN_020749 <NA> Ortho…
## 11 AT4G24220 <NA> FUN_029917 <NA> Ortho…
## 12 AT4G24220 <NA> FUN_029968 <NA> Ortho…
## 13 AT1G01030 <NA> FUN_025493 <NA> RBH
## 14 AT1G01040 <NA> FUN_011748 <NA> RBH
## 15 ATMG01250 <NA> FUN_040221 <NA> RBH
## 16 ATMG01360 <NA> FUN_026804 <NA> RBH
## 17 AT1G01210 AT1G01210.3 Pav_sc0000586.1_g580.1.mk Pav_sc0000586.1_g58… ensem…
## 18 AT1G01225 AT1G01225.1 Pav_sc0000586.1_g550.1.mk Pav_sc0000586.1_g55… ensem…
## 19 ATCG00710 ATCG00710.1 Pav_sc0000216.1_g900.1.mk Pav_sc0000216.1_g90… ensem…
## 20 ATMG00310 ATMG00310.1 Pav_sc0001554.1_g020.1.br Pav_sc0001554.1_g02… ensem…
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])
# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)##
## MCScanX
## 5
## from_geneID from_protID to_geneID to_protID source
## <char> <char> <char> <char> <char>
## 1: AT4G19110.uORF1 AT4G19110.uORF1 <NA> FUN_023847-T1 MCScanX
## 2: AT5G45430.uORF1 AT5G45430.uORF1 <NA> FUN_023847-T1 MCScanX
## 3: AT5G09460.uORF1 AT5G09460.uORF1 <NA> FUN_028837-T1 MCScanX
## 4: AT5G64340.uORF1 AT5G64340.uORF1 <NA> FUN_028837-T1 MCScanX
## 5: AT1G29950.uORF2 AT1G29950.uORF2 <NA> FUN_032407-T1 MCScanX
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_protID to_geneID to_protID source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 AT1G12040.1 FUN_000050 FUN_000050-T1 FastO…
## 2 AT1G62440 AT1G62440.1 FUN_000050 FUN_000050-T1 FastO…
## 3 AT2G43840 AT2G43840.2 FUN_040335 FUN_040335-T1 FastO…
## 4 AT2G44050 AT2G44050.1 FUN_040336 FUN_040336-T1 FastO…
## 5 AT5G58130 AT5G58130.1 FUN_000052 FUN_000052-T1 MCSca…
## 6 AT5G58130 AT5G58130.1 FUN_000053 FUN_000053-T1 MCSca…
## 7 AT2G43840 AT2G43840.2 FUN_040335 FUN_040335-T1 MCSca…
## 8 AT2G44050 AT2G44050.1 FUN_040336 FUN_040336-T1 MCSca…
## 9 AT4G39370 <NA> FUN_020728 <NA> Ortho…
## 10 AT3G06350 <NA> FUN_020749 <NA> Ortho…
## 11 AT4G24220 <NA> FUN_029917 <NA> Ortho…
## 12 AT4G24220 <NA> FUN_029968 <NA> Ortho…
## 13 AT1G01030 <NA> FUN_025493 <NA> RBH
## 14 AT1G01040 <NA> FUN_011748 <NA> RBH
## 15 ATMG01250 <NA> FUN_040221 <NA> RBH
## 16 ATMG01360 <NA> FUN_026804 <NA> RBH
## 17 AT1G01210 AT1G01210.3 Pav_sc0000586.1_g580.1.mk Pav_sc0000586.1_g58… ensem…
## 18 AT1G01225 AT1G01225.1 Pav_sc0000586.1_g550.1.mk Pav_sc0000586.1_g55… ensem…
## 19 ATCG00710 ATCG00710.1 Pav_sc0000216.1_g900.1.mk Pav_sc0000216.1_g90… ensem…
## 20 ATMG00310 ATMG00310.1 Pav_sc0001554.1_g020.1.br Pav_sc0001554.1_g02… ensem…
summary_na = df[, .(
na_to_geneID = sum(is.na(to_geneID)),
na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)## source na_to_geneID na_to_protID
## <char> <int> <int>
## 1: ensembl-compara 0 0
## 2: FastOMA 0 0
## 3: MCScanX 0 0
## 4: OrthoDB 0 38228
## 5: RBH 0 25594
here we have some loses because genes between versions do not translate well!
fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
plaza = data.table::fread(file.path(fp, fn))
} else {
plaza = data.frame(matrix(ncol = 4, nrow = 0))
}
compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name
colnames(compara)[3] = colnames(plaza)[3] = 'source'
compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1,
sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)## OrthoDB_ID Ortholog
## <char> <char>
## 1: Pav_sc0007128.1_g050.1.br FUN_003782-T1 FUN_003782
## 2: Pav_sc0011022.1_g010.1.br FUN_040221-T1 FUN_040221
## 3: Pav_sc0001987.1_g160.1.br FUN_040221-T1 FUN_040221
## 4: Pav_sc0001114.1_g300.1.br FUN_040221-T1 FUN_040221
## 5: Pav_co4062149.1_g010.1.br FUN_040221-T1 FUN_040221
## 6: Pav_sc0001576.1_g400.1.br FUN_040221-T1 FUN_040221
if (nrow(plaza) != 0) {
plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = plaza[, {
# Cartesian join of OrthoDB_list and Orthologs_list for this row
pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
setnames(pairs, c("OrthoDB_ID", "Ortholog"))
pairs
}, by = seq_len(nrow(plaza))]
plaza = result[, seq_len := NULL]
# plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
plaza = plaza[!duplicated(plaza), ]
head(plaza)
}
if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)
if (flag2 == 1) { # geneID and prot ID are completely different # make flags
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
} else {
df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
}
df_compara = df_compara[!is.na(df_compara$to_geneID), ]
if (nrow(plaza) != 0) {
df_plaza = dplyr::filter(df, source == "PLAZA") %>%
dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
dplyr::mutate(to_geneID = Ortholog) %>%
dplyr::select(-Ortholog)
df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}
if (nrow(plaza) != 0) {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))
dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
dt = dplyr::bind_rows(df_compara, df_other)
}
ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]
if (nrow(plaza) != 0) {
ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
ind = which(df$source %in% c('ensembl-compara'))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 3
## from_geneID to_geneID source
## <chr> <chr> <chr>
## 1 AT1G12040 FUN_000050 FastOMA
## 2 AT1G62440 FUN_000050 FastOMA
## 3 AT2G43840 FUN_040335 FastOMA
## 4 AT2G44050 FUN_040336 FastOMA
## 5 AT5G58130 FUN_000052 MCScanX
## 6 AT5G58130 FUN_000053 MCScanX
## 7 AT2G43840 FUN_040335 MCScanX
## 8 AT2G44050 FUN_040336 MCScanX
## 9 AT4G39370 FUN_020728 OrthoDB
## 10 AT3G06350 FUN_020749 OrthoDB
## 11 AT4G24220 FUN_029917 OrthoDB
## 12 AT4G24220 FUN_029968 OrthoDB
## 13 AT1G01030 FUN_025493 RBH
## 14 AT1G01040 FUN_011748 RBH
## 15 ATMG01250 FUN_040221 RBH
## 16 ATMG01360 FUN_026804 RBH
## 17 AT1G01210 FUN_011652 ensembl-compara
## 18 AT1G01225 FUN_011648 ensembl-compara
## 19 AT5G67620 FUN_021483 ensembl-compara
## 20 ATMG00310 FUN_030126 ensembl-compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2611684 139.5 7524444 401.9 11756942 627.9
## Vcells 50980318 389.0 152524140 1163.7 190628107 1454.4
## [1] 22172
## [1] 21950
##
## ensembl-compara FastOMA MCScanX OrthoDB RBH
## 4367 45924 19709 38228 25594
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods")
# Print or save the plot
print(upset_plot)>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
## Key: <from_geneID, to_geneID>
## from_geneID to_geneID FastOMA MCScanX OrthoDB RBH ensembl-compara
## <char> <char> <lgcl> <lgcl> <lgcl> <lgcl> <lgcl>
## 1: AT1G29950.uORF2 FUN_032407 FALSE TRUE FALSE FALSE FALSE
## 2: AT4G19110.uORF1 FUN_023847 FALSE TRUE FALSE FALSE FALSE
## 3: AT5G09460.uORF1 FUN_028837 FALSE TRUE FALSE FALSE FALSE
## 4: AT5G45430.uORF1 FUN_023847 FALSE TRUE FALSE FALSE FALSE
## 5: AT5G64340.uORF1 FUN_028837 FALSE TRUE FALSE FALSE FALSE
## count_evidence
## <num>
## 1: 1
## 2: 1
## 3: 1
## 4: 1
## 5: 1
# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]
combined = gmm[, .(
BINCODE = paste(unique(BINCODE), collapse = " | "),
NAME = paste(unique(NAME), collapse = " | "),
DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]
charToRaw(combined$IDENTIFIER[1])## [1] 27 66 75 6e 5f 30 31 33 33 39 34 2d 74 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE) # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE) # change as needed
charToRaw(combined$IDENTIFIER[1])## [1] 66 75 6e 5f 30 31 33 33 39 34 2d 74 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2)) # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)##
## FALSE TRUE
## 16470 23868
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "ensembl-compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "athName" "athSynonims" "all_pathways"
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 76691 2
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName1",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag1", "flag2")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3216619 171.8 7524444 401.9 11756942 627.9
## Vcells 45501716 347.2 152524140 1163.7 190628107 1454.4
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 22167
## [1] 21948
## [1] 1 122
## [1] 1 115
## [1] 242
## [1] 131
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag1 == 1) {
methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 57035 12115 2872
##
## 100% match no match partial match
## 1 28882 10077 2401
## 2 9643 1295 248
## 3 8304 454 119
## 4 8447 244 84
## 5 1759 45 20
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
# Initialize an empty set or list covered_genes.
#
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
#
# For every row in the data table dt:
#
# a. Check if filter_criteria is "reject" for the row.
#
# b. Check if the value of the column corresponding to method in this row is TRUE.
#
# c. Check if from_geneID in this row is not in covered_genes.
#
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
# Check if gene pair is covered by all three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = "OrthoDB_FastOMA_RBH"
# Else if gene pair is covered by any two of those three methods:
# If yes:
# is_candidate = TRUE
# new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
# Else if MapMan4_Match string contains "match based on" and current method name:
# is_candidate = TRUE
# new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
# Else:
# is_candidate = FALSE
special_methods = c("OrthoDB", "RBH", "FastOMA")
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 5081 1779 3127
## #### #### #### ####
##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 1319 3127 2137 893
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 19814 3065 5081 1317
## RBH_MapMan4 reject
## 1779 33490
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 34942 2583 1007
##
## 100% match no match partial match
## 1 11045 883 634
## 2 6524 1002 163
## 3 7269 411 106
## 4 8345 242 84
## 5 1759 45 20
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag1 == 1) {
source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) { # make flags
source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 19864
## [1] 20842
## [1] 1 59
## [1] 1 96
## [1] 11
## [1] 15
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## ensembl-compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 49 113 79 36
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 795 84 109 61
## RBH_MapMan4 reject
## 28 1414
openxlsx::write.xlsx(pss_long,
paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)# Step 1: params_list
# params_list <- list(
# ...
# )
#
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
# plantName1: NULL
# plantName2: NULL
# plantName3: NULL
# plantName4: NULL
# plantDirIn: NULL
# plantNameOut: NULL
# plantDirOut: NULL
# pattern_in: NULL
# pattern_out: NULL
# compara_pattern_in1: NULL
# compara_pattern_in2: NULL
# plaza_pattern_in1: NULL
# plaza_pattern_in2: NULL
# ref_genome: NULL
# mercator: NULL
# mercatorPatternIn1: NULL
# mercatorPatternOut1: NULL
# mercatorPatternIn2: NULL
# mercatorPatternOut2: NULL
# ---
#
#
# access params in the script like:
# params$plantName1
# params$plantDirOut
#
# Step 3: Call render() like
# rmarkdown::render(
# input = "09_fruitTrees-child.Rmd",
# params = params_list,
# envir = new.env(parent = globalenv()), # optional: isolate execution
# output_format = "html_document",
# quiet = FALSE
# )
#
#
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## time zone: Europe/Ljubljana
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_3.5.2 knitr_1.50 data.table_1.17.0
## [5] magrittr_2.0.3
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3 dplyr_1.1.4
## [5] compiler_4.4.1 zip_2.3.2 Rcpp_1.0.14 tidyselect_1.2.1
## [9] stringr_1.5.1 dichromat_2.0-0.1 jquerylib_0.1.4 textshaping_1.0.1
## [13] systemfonts_1.2.3 scales_1.4.0 yaml_2.3.10 fastmap_1.2.0
## [17] R6_2.6.1 labeling_0.4.3 patchwork_1.3.0 generics_0.1.4
## [21] igraph_2.1.4 openxlsx_4.2.8 tibble_3.2.1 bslib_0.9.0
## [25] pillar_1.10.2 RColorBrewer_1.1-3 rlang_1.1.5 utf8_1.2.5
## [29] cachem_1.1.0 stringi_1.8.7 xfun_0.52 sass_0.4.10
## [33] cli_3.6.3 withr_3.0.2 digest_0.6.37 grid_4.4.1
## [37] rstudioapi_0.17.1 lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.3
## [41] glue_1.8.0 farver_2.1.2 ragg_1.4.0 colorspace_2.1-1
## [45] rmarkdown_2.29 tools_4.4.1 pkgconfig_2.0.3 htmltools_0.5.8.1